library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qc
library(org.Mm.eg.db) # To annotate the genenames
sce <- readRDS(here("processed", "sce.RDS"))
We start by removing genes not expressed in any cell
genes_beforeqc <- dim(sce)[1]
keep_feature <- rowSums(counts(sce) > 0) > 0
sce <- sce[keep_feature, ]
genes_beforeqc -dim(sce)[1]
## [1] 6511
6511 cells are not expressed in any cell.
First are sorted the genenames and gene symbols, because the default ensembl notation is not very handy. And then save the mitochondrial genes as such.
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
# obtain full genenames
genename <- mapIds(org.Mm.eg.db, keys = rownames(sce), keytype = "ENSEMBL", column = c("GENENAME"))
# Use the symbols as rownames
# first make gene names unique
# TODO: save duplicate gene name list
symb_unique <- uniquifyFeatureNames(rownames(sce), rowData(sce)[, "Symbol"])
# Now they can be used as rownames
rownames(sce) <- symb_unique
# Add full gene names and the uniuqe symbols to the rowdata
rowData(sce)$symb_uniq <- symb_unique
rowData(sce)$gene_name <- genename
# Subset the mitochondrial genes
is_mito <- grepl("^mt-", rownames(sce))
}
<<<<<<< Updated upstream
## 'select()' returned 1:many mapping between keys and columns
Then we can use the scater package to add the quality per cell ## First try log for low thresholds but not for high. In low this prevents negative thresholds
=======Then we can use the scater package to add the quality per cell
>>>>>>> Stashed changesif(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce <- addPerCellQC(sce, subsets = list(mt=is_mito))
# Automated outlier detection
outlier_lib_low <- isOutlier(sce$total, log = TRUE, type = "lower")
outlier_expr_low <-
isOutlier(sce$detected, log = TRUE, type = "lower")
outlier_lib_high <- isOutlier(sce$total, type = "higher")
outlier_expr_high <-
isOutlier(sce$detected, type = "higher")
outlier_mt <- isOutlier(sce$subsets_mt_percent, type = "higher")
# total
outlier <-
outlier_lib_low |
outlier_expr_low |
outlier_lib_high | outlier_expr_high | outlier_mt
# See how many cells are detected as outliers
DataFrame(
lib_size_high = sum(outlier_lib_high),
expression_high = sum(outlier_expr_high),
lib_size_low = sum(outlier_lib_low),
expression_low = sum(outlier_expr_low),
mt_pct = sum(outlier_mt),
total = sum(outlier)
)
# And the thresholds
attr(outlier_lib_high, "thresholds")
attr(outlier_expr_high, "thresholds")
attr(outlier_lib_low, "thresholds")
attr(outlier_expr_low, "thresholds")
attr(outlier_mt, "thresholds")
# TODO do the analysis separating by batch?
# Add if it is an outlier to the metadata
sce$outlier <- outlier
# Save the object at this stage with the label "preliminary analysis"
# As only annotation addition has been done, without deleting anything yet.
saveRDS(sce, here("processed", "sce_preliminary.RDS"))
}else{
sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}
Diagnostic plots to visualize the data distribution.
plotColData(sce, x="Sample", y="sum", colour_by="outlier") +
ggtitle("Total count") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="sum", colour_by="outlier") +
scale_y_log10() + ggtitle("Total count log scale") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="detected", colour_by="outlier") +
scale_y_log10() + ggtitle("Detected Genes") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="subsets_mt_percent", colour_by="outlier") + ggtitle("Mitocchondrial percentatge") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
hist(
sce$total,
breaks = 100
)
hist(
sce$detected,
breaks = 100
)
plotColData(sce, x="sum", y="subsets_mt_percent", colour_by="outlier")
plotColData(sce, x="sum", y="detected", colour_by="outlier")
plotColData(sce, x="sum", y="detected", colour_by="Sample")
Here we run a PCA using the information in the metadata instead of the gene expression. It is usefull to visualize the QC parametres.
sce <- runColDataPCA(sce, variables = c("sum","detected", "subsets_mt_percent"))
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "outlier")
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "Sample")
sce$chip <- as.character(sce$chip)
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "chip")